library(tidyverse)
library(data.table)
library(dtplyr)
library(lubridate)
library(stringr)
library(zoo)
library(here)
library(sassy)
library(RPostgres)

# Open the log
lf <- log_open(file.path(here("output", "log"), "log_B_CompustatAnnual.log"), autolog = T, show_notes = F)

# Send code to the log
log_code()

# 1 Read data from WRDS -------------------------------------------------------------------
sep("1 Read data from WRDS")

wrds <- dbConnect(Postgres(),
                  host='wrds-pgdata.wharton.upenn.edu',
                  port=9737,
                  dbname='wrds',
                  sslmode='require',
                  user='...'). # input your username

SQL_statement <- 
  "
  SELECT a.gvkey, a.datadate, a.conm, a.fyear, a.tic, a.cusip, a.naicsh, a.sich, 
	        a.aco,a.act,a.ajex,a.am,a.ao,a.ap,a.at,a.capx,a.ceq,a.ceqt,a.che,a.cogs,
		      a.csho,a.cshrc,a.dcpstk,a.dcvt,a.dlc,a.dlcch,a.dltis,a.dltr,
		      a.dltt,a.dm,a.dp,a.drc,a.drlt,a.dv,a.dvc,a.dvp,a.dvpa,a.dvpd,
		      a.dvpsx_c,a.dvt,a.ebit,a.ebitda,a.emp,a.epspi,a.epspx,a.fatb,a.fatl,
		      a.ffo,a.fincf,a.fopt,a.gdwl,a.gdwlia,a.gdwlip,a.gwo,a.ib,a.ibcom,
		      a.intan,a.invt,a.ivao,a.ivncf,a.ivst,a.lco,a.lct,a.lo,a.lt,a.mib,
		      a.msa,a.ni,a.nopi,a.oancf,a.ob,a.oiadp,a.oibdp,a.pi,a.ppenb,a.ppegt,
		      a.ppenls,
		      a.ppent,a.prcc_c,a.prcc_f,a.prstkc,a.prstkcc,a.pstk,a.pstkl,a.pstkrv,
		      a.re,a.rect,a.recta,a.revt,a.sale,a.scstkc,a.seq,a.spi,a.sstk,
		      a.tstkp,a.txdb,a.txdi,a.txditc,a.txfo,a.txfed,a.txp,a.txt,
		      a.wcap,a.wcapch,a.xacc,a.xad,a.xint,a.xrd,a.xpp,a.xsga
	FROM COMP.FUNDA as a
	WHERE a.consol = 'C'
	AND a.popsrc = 'D'
	AND a.datafmt = 'STD'
	AND a.curcd = 'USD'
	AND a.indfmt = 'INDL'
"

res <- dbSendQuery(conn = wrds, statement = SQL_statement)
df_CompustatAnnual <- dbFetch(res)
dbClearResult(res)

# 2 Some cleaning -------------------------------------------------------------------
sep("2 Some cleaning")

# Step 1: Drop rows with missing values in specific columns
df_CompustatAnnual <- df_CompustatAnnual %>%
  filter(!is.na(at) & !is.na(prcc_c) & !is.na(ni))

# Step 2: Generate a new column 'cnum' with the first 6 characters of 'cusip'
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(cnum = substr(cusip, 1, 6))

# Initialize 'dr' (deferred revenue) column with missing values
# drc = deferred revenue current 
# drlt = deferred revenue long-term
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(dr = NA) %>%
  mutate(dr = ifelse(!is.na(drc) & !is.na(drlt), drc + drlt, dr)) %>%
  mutate(dr = ifelse(!is.na(drc) & is.na(drlt), drc, dr)) %>%
  mutate(dr = ifelse(is.na(drc) & !is.na(drlt), drlt, dr))


# Initialize 'dc' column with missing values
# dcpstk = convertible debt and preferred stock, pstk = preferred stock, dcvt = convertible debt
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(dc = NA) %>%
  mutate(dc = ifelse(dcpstk > pstk & !is.na(pstk) & !is.na(dcpstk) & is.na(dcvt), dcpstk - pstk, dc)) %>%
  mutate(dc = ifelse(is.na(pstk) & !is.na(dcpstk) & is.na(dcvt), dcpstk, dc)) %>% 
  mutate(dc = ifelse(is.na(dc), dcvt, dc))

# xint = interest and related expense
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(xint0 = ifelse(!is.na(xint), xint, 0))

# xsga = Selling, general and administrative expenses
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(xsga0 = ifelse(!is.na(xsga), xsga, 0))

# Initialize xad0 column with 0
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(xad0 = ifelse(!is.na(xad), xad, 0))

# Define the columns to replace missing values with 0
zero_cols <- c("nopi", "dvt", "ob", "dm", "dc", "aco", "ap", "intan", "ao", "lco", "lo", "rect", 
               "invt", "drc", "spi", "gdwl", "che", "dp", "act", "lct", "tstkp", "dvpa", "scstkc", 
               "sstk", "mib", "ivao", "prstkc", "prstkcc", "txditc", "ivst")

# Replace missing values with 0 in the specified columns using %>% (pipe)
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(across(all_of(zero_cols), ~ifelse(is.na(.), 0, .)))


# 3 Merge with CRSP linking table ------------------------------------------------------------------
sep("3 Merge with CRSP linking table")

df_CCMLinkingTable <- readRDS(here("data", "Link", "CCMLinkingTable.RDS"))

df_CompustatAnnual <- df_CompustatAnnual %>% inner_join(df_CCMLinkingTable, by = "gvkey")
df_CompustatAnnual <- df_CompustatAnnual %>%
  filter(timelinkstart_d <= datadate & (datadate <= timelinkend_d | is.na(timelinkend_d)))

# Assuming 6-month reporting lag
df_CompustatAnnual <- df_CompustatAnnual %>%
  mutate(time_avail_m = as.Date(datadate) %m+% months(6)) %>% 
  mutate(time_avail_m = year(time_avail_m) * 100 + month(time_avail_m))

# Annual version: Remove unwanted columns
df_CompustatAnnual <- df_CompustatAnnual %>%
  select(-timelinkstart_d, -timelinkend_d, -linkprim, -linktype, -liid)

df_CompustatAnnual <- df_CompustatAnnual %>% select(gvkey, permno, everything())

# save
fwrite(df_CompustatAnnual, here("data", "Compustat", "a_aCompustat.csv"))
saveRDS(df_CompustatAnnual, here("data", "Compustat", "a_aCompustat.RDS"))

# 4 Monthly version ------------------------------------------------------------------
sep("4 Monthly version")

df_m_aCompustat <- df_CompustatAnnual

setDT(df_m_aCompustat)
# Expand each row by repeating 12 times
df_m_aCompustat <- df_m_aCompustat[, .SD[rep(1:.N, each = 12)], by = .(gvkey)] 
# Create 'tempTime' column as a copy of 'time_avail_m'
df_m_aCompustat[, tempTime := time_avail_m] 

# Update 'time_avail_m' based on group-specific calculations
df_m_aCompustat[, datadate2 := datadate %m+% months(0:(.N - 1)), by = .(gvkey, tempTime)]
df_m_aCompustat[, datadate2 := as.Date(datadate2) %m+% months(6)] 
df_m_aCompustat[, time_avail_m := year(datadate2)*100+month(datadate2)]

# Drop 'temp' and 'tempTime' columns
df_m_aCompustat <- df_m_aCompustat %>% 
  select(-tempTime, datadate2)

# Keep rows based on conditions using .I
# this affects .14% of observations that had changes of fiscal year ends.
# In that case, we keep the more recent info
df_m_aCompustat <- df_m_aCompustat %>%
  arrange(gvkey, time_avail_m, datadate) %>%
  group_by(gvkey, time_avail_m) %>%
  filter(row_number() == n()) %>%
  ungroup() %>% 
  as.data.table()

# This affects an additional 89/3m observation
df_m_aCompustat <- df_m_aCompustat %>%
  arrange(permno, time_avail_m, datadate) %>%
  group_by(permno, time_avail_m) %>%
  filter(row_number() == n()) %>%
  ungroup() %>% 
  as.data.table()

# save
fwrite(df_m_aCompustat, here("data", "Intermediate", "m_aCompustat.csv"))
saveRDS(df_m_aCompustat, here("data", "Intermediate", "m_aCompustat.RDS"))

# Close log
log_close()

# View results
writeLines(readLines(lf))
